Load necessary packages.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.2
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(ggwordcloud)
## Warning: package 'ggwordcloud' was built under R version 4.3.2
library(stringi)
Read the dataset and drop NA vlaues.
youtube_df <- read_csv("Data/cleaned_youtube_df.csv") %>%
janitor::clean_names() %>%
filter(created_year >= 2005) %>%
drop_na()
## Rows: 995 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): category, country, abbreviation, channel_type
## dbl (14): id, subscribers, video_views, uploads, video_views_for_the_last_30...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(youtube_df)
## id subscribers video_views category
## Min. : 1.0 Min. : 12300000 Min. :2.634e+03 Length:588
## 1st Qu.:214.5 1st Qu.: 14700000 1st Qu.:4.920e+09 Class :character
## Median :454.5 Median : 18600000 Median :8.770e+09 Mode :character
## Mean :471.8 Mean : 24244048 Mean :1.282e+10
## 3rd Qu.:723.2 3rd Qu.: 26250000 3rd Qu.:1.512e+10
## Max. :995.0 Max. :245000000 Max. :2.280e+11
## uploads country abbreviation channel_type
## Min. : 1 Length:588 Length:588 Length:588
## 1st Qu.: 430 Class :character Class :character Class :character
## Median : 1194 Mode :character Mode :character Mode :character
## Mean : 13968
## 3rd Qu.: 3886
## Max. :301308
## video_views_for_the_last_30_days lowest_monthly_earnings
## Min. :3.000e+00 Min. : 0
## 1st Qu.:4.867e+07 1st Qu.: 11500
## Median :1.145e+08 Median : 28200
## Mean :2.450e+08 Mean : 54343
## 3rd Qu.:2.443e+08 3rd Qu.: 59025
## Max. :6.589e+09 Max. :850900
## highest_monthly_earnings lowest_yearly_earnings highest_yearly_earnings
## Min. : 0 Min. : 0 Min. : 0
## 1st Qu.: 183200 1st Qu.: 137400 1st Qu.: 2200000
## Median : 451500 Median : 338650 Median : 5400000
## Mean : 868831 Mean : 651617 Mean : 10433733
## 3rd Qu.: 944300 3rd Qu.: 708225 3rd Qu.: 11300000
## Max. :13600000 Max. :10200000 Max. :163400000
## subscribers_for_last_30_days created_year population
## Min. : 1 Min. :2005 Min. :2.025e+05
## 1st Qu.: 100000 1st Qu.:2010 1st Qu.:1.081e+08
## Median : 200000 Median :2013 Median :3.282e+08
## Mean : 356791 Mean :2013 Mean :4.909e+08
## 3rd Qu.: 400000 3rd Qu.:2016 3rd Qu.:3.282e+08
## Max. :8000000 Max. :2022 Max. :1.398e+09
## latitude longitude
## Min. :-38.42 Min. :-172.105
## 1st Qu.: 20.59 1st Qu.: -95.713
## Median : 30.48 Median : -3.436
## Mean : 26.19 Mean : -6.101
## 3rd Qu.: 37.09 3rd Qu.: 78.963
## Max. : 61.52 Max. : 138.253
We use histogram and density plot to intuitively visualize the distribution of different variables.
all_columns <- colnames(youtube_df)
columns_to_plot <- all_columns[!all_columns %in% c("id", "category","country","abbreviation","channel_type","population","latitude","longitude","created_year")]
numeric_data_long <-
youtube_df[, columns_to_plot] %>%
gather(key = "variable", value = "value")
# Create a single plot with facets for each numeric variable
p <- ggplot(numeric_data_long, aes(x = value)) +
geom_histogram(aes(y = ..density..),bins = 15, fill = "#8dab7f", alpha = 0.8) +
geom_density(color="#6b8e23")+
facet_wrap(~ variable, scales = "free", ncol = 3) +
scale_x_continuous(labels = scales::comma) +
theme_minimal(base_size = 10) +
theme(
strip.text.x = element_text(size = 10, face = "bold"),
axis.text.x = element_text(angle = 20, hjust = 1, vjust = 1,size=7,face = "bold"), # Angle x-axis labels for readability
axis.title.x = element_text(size = 12),
axis.title.y = element_text(size = 12),
plot.title = element_text(size = 16, face = "bold"),
plot.margin = margin(1, 1, 1, 1, "cm"), # Adjust the plot margins
strip.background = element_blank(),
panel.spacing = unit(3, "lines")
) +
labs(
title = "Distribution of Numeric Variables",
x = "Value",
y = "Frequency",
caption = "Source: YouTube Data"
)
# Convert to an interactive plot
ggplotly(p)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## ℹ The deprecated feature was likely used in the ggplot2 package.
## Please report the issue at <https://github.com/tidyverse/ggplot2/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
We plot heatmap to study the relationships among numerical variables.
# Calculate the correlation matrix
cor_matrix <- cor(youtube_df[, columns_to_plot], use = "complete.obs")
fig <- plot_ly(x = colnames(cor_matrix), y = rownames(cor_matrix), z = cor_matrix,
type = "heatmap",colorscale ="Greens" , zmin = -1, zmax = 1)
fig <- fig %>% layout(
yaxis = list(autorange = "reversed"),
width=800,
height=600,
title = "Correlation Matrix")
## Warning: Specifying width/height in layout() is now deprecated.
## Please specify in ggplotly() or plot_ly()
fig
Wordcloud for channel category.
category_data <- youtube_df %>%
filter(!is.na(category) & category != "nan") %>%
count(category) %>%
mutate(n=n*30) %>%
ungroup()
category_data$scaled_size <- log(category_data$n + 1) # adding 1 to avoid log(0)
wordcloud_plot <- ggplot(category_data, aes(label = category, size = scaled_size)) +
geom_text_wordcloud(
aes(color = n),
shape = 'circle',
rm_outside = TRUE
) +
scale_size_area(max_size = 10) +
scale_color_gradient(low = "#ffcc99", high = "#8dab7f") +
theme_void(base_family = "sans") +
theme(legend.position = "none",
plot.margin = margin(1, 1, 1, 1, "cm")) # Adjust margins around the plot
# Display the plot
wordcloud_plot
Pie chart for channel type.
channel_type_counts <- table(youtube_df$channel_type)
channel_type_counts <- youtube_df %>%
group_by(channel_type) %>%
summarise(count = n()) %>%
ungroup()
color <- c("#ffcc99","#ffe4b5", "#ffd180","#ffa07a","#d1d17a", "#8dab7f", "#D2DFD9", "#A8C0B5", "#D1B9CB", "#B3CDD1", "#BBC1D0", "#E8C3C3","#C7CEBD", "#D2DFD9","#6b8e23")
# Create a pie chart using plotly with the custom colors
fig <- plot_ly(channel_type_counts, labels = ~channel_type, values = ~count, type = 'pie',
textinfo = 'label+percent',
insidetextorientation = 'radial',
marker = list(colors = color))
fig %>%
layout(title = 'Pie Chart of Channel Types',
showlegend = FALSE,
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
Bubble Chart: Subscribers vs. Video Views by Category
size_factor <- 10^(-8.7) # Adjust this factor as needed to scale sizes up or down
youtube_df %>%
filter(!is.na(category) & category != "nan") %>%
plot_ly(
x = ~subscribers,
y = ~video_views,
size = ~video_views_for_the_last_30_days * size_factor,
color = ~category,
text = ~category,
hoverinfo = 'text+x+y',
type = 'scatter',
mode = 'markers',
marker = list(
sizemode = 'area',
sizeref = 2 * max(youtube_df$video_views_for_the_last_30_days * size_factor)/100
)
) %>%
layout(
title = 'Subscribers vs. Video Views by Category',
xaxis = list(type = 'log', title = 'Subscribers'),
yaxis = list(type = 'log', title = 'Video Views (in billions)'),
hovermode = 'closest',
showlegend = TRUE
)
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
Here I want to investigate the top 10 YouTube channels by the highest yearly earnings. I’ll load the original dataset again.
color <- c("#ffe4b5", "#ffa07a","#d1d17a" , "#D2DFD9", "#A8C0B5", "#D1B9CB", "#B3CDD1", "#BBC1D0", "#E8C3C3")
earning_plot_data <-
read_csv("Data/Global YouTube Statistics.csv",locale = locale(encoding = "Windows-1252")) %>%
janitor::clean_names() %>%
drop_na() %>%
select(youtuber, channel_type, highest_yearly_earnings) %>%
mutate(youtuber = stringi::stri_replace_all_regex(youtuber, "[^\x01-\x7F]", "")) %>%
arrange(desc(highest_yearly_earnings)) %>%
top_n(15, highest_yearly_earnings)
## Rows: 995 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Youtuber, category, Title, Country, Abbreviation, channel_type, cr...
## dbl (21): rank, subscribers, video views, uploads, video_views_rank, country...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
plot_ly(earning_plot_data, x = ~highest_yearly_earnings, y = ~youtuber,
type = 'bar', orientation = 'h',
color = ~channel_type, colors = color,
text = ~paste('$', formatC(highest_yearly_earnings, format = "d", big.mark = ",")),
textposition = 'inside',
insidetextanchor = 'end',
textfont = list(color = 'white'), # text color
hoverinfo = 'text',
hovertemplate = paste('<b>Youtuber:</b> %{y}<br>',
'<b>Earnings:</b> $%{x}<extra></extra>')) %>%
layout(title = 'Top 15 YouTube Channels by Highest Yearly Earnings',
xaxis = list(title = 'Yearly Earnings ($)'),
yaxis = list(title = ''),
showlegend = TRUE,
margin = list(l = 100, r = 25, t = 50, b = 50),
font = list(family = "Arial, sans-serif", size = 12, color = "#333333"))